#' ---
#' title: Create table of examples where lexical string distance is misleading
#' author: Joe Ornstein
#' date: 2024-09-13
#' version: 0.11
#' ---

# Embedding source switch: TRUE loads previously saved embeddings from disk;
# set to FALSE to retrieve embeddings from the API instead
from_file <- TRUE

library(tidyverse)
library(stringdist)
library(fuzzylink)
library(tinytable)

# Example pairs, two rows per query string: in the first row of each pair
# String 2 is the intended match, in the second it is a lexically similar decoy
tb <- tibble(
  `String 1` = rep(c('AARP', 'USPS', 'Mike Kelly', 'Kit Bond'), each = 2),
  `String 2` = c(
    'American Association of Retired Persons', 'AAA',
    'US Post Office', 'UPS',
    'George Joseph "Mike" Kelly, Jr.', 'Mark Edward Kelly',
    'Christopher Samuel Bond', 'Katie Britt'
  )
)

# add lexical similarity measures, one column per method, rounded to 3 decimals
# (stringsim() rescales each string distance to a similarity score)
tb <- tb |>
  mutate(
    `Levenshtein` = round(
      stringsim(`String 1`, `String 2`, method = 'lv'), 3
    ),
    # method = 'jw' with stringdist's default p = 0 is the plain Jaro
    # similarity; the Winkler prefix bonus only applies when p > 0, so pass the
    # conventional p = 0.1 to make the values match the column label
    `Jaro-Winkler` = round(
      stringsim(`String 1`, `String 2`, method = 'jw', p = 0.1), 3
    ),
    `Jaccard` = round(
      stringsim(`String 1`, `String 2`, method = 'jaccard'), 3
    )
  )

# Obtain embeddings for every distinct string in the table: read the cached
# copy when from_file is set, otherwise request them and refresh the cache
if (from_file) {
  # expected to restore the object `emb` written by the save() call below
  load('data/table1_embeddings.RData')
} else {
  emb <- get_embeddings(unique(c(tb[['String 1']], tb[['String 2']])))
  save(emb, file = 'data/table1_embeddings.RData')
}

# Embedding similarity for each row's pair: build the similarity matrix between
# the String 1 and String 2 entries and keep only its diagonal, whose i-th
# element is the score for row i's own pair
tb[['Embedding']] <- diag(
  get_similarity_matrix(emb, tb[['String 1']], tb[['String 2']])
)

# Render the table and write it out as LaTeX. Bold is applied by position:
# even rows get columns 3-5 (the three lexical measures) in bold, odd rows get
# columns 1, 2 and 6 (both strings and the embedding score) in bold.
tb |>
  tt(
    digits = 3,
    caption = 'Examples where lexical similarity is a misleading measure of match quality. Best match according to four string distance measures in bold. In each case, conventional measures of string distance choose the wrong match, while embedding distance chooses the correct match.'
  ) |>
  style_tt(
    i = seq(2, nrow(tb), by = 2),
    j = 3:5,
    bold = TRUE
  ) |>
  style_tt(
    i = seq(1, nrow(tb), by = 2),
    j = c(1, 2, 6),
    bold = TRUE
  ) |>
  save_tt('tables/table1.tex', overwrite = TRUE)